Show the code
#|
if (get_count) {
# Total number of works in OpenAlex: an empty search over the "works" entity
# that asks for the count only.
oa_count$oa <- openalexR::oa_fetch(
  entity = "works",
  search = "",
  count_only = TRUE,
  output = "list",
  verbose = TRUE
)$count
}Data Management Report
A short description of what this is about. This is not a traditional abstract, but rather something else …
IPBES_TCA_Ch2_technology
%The BuidNo is automatically increased by one each time the report is rendered. It is used to indicate different renderings when the version stays the same%.
All searches are done on all works in OpenAlex. Searching within the TCA Corpus is not possible at the moment, but we are working on it.
The search terms are based on the shared Google Doc. They have been cleaned up for use in OpenAlex.
#|
if (get_count) {
# Total number of works in OpenAlex: an empty search over the "works" entity
# that asks for the count only.
oa_count$oa <- openalexR::oa_fetch(
  entity = "works",
  search = "",
  count_only = TRUE,
  output = "list",
  verbose = TRUE
)$count
}#|
if (get_count) {
# Number of works per publication year without any search restriction,
# together with each year's share of the total (p) and the running share over
# the years (p_cum).
# NOTE(review): unlike the total-count query, no `entity`/`search` argument is
# passed here - confirm that oa_fetch() accepts a bare `group_by`.
oa_count$oa_years <- openalexR::oa_fetch(
  verbose = TRUE,
  output = "dataframe",
  group_by = "publication_year"
) |>
  dplyr::mutate(
    # the year is taken from `key_display_name` and converted to an integer;
    # the two key columns are dropped afterwards
    publication_year = as.integer(as.character(key_display_name)),
    key = NULL,
    key_display_name = NULL
  ) |>
  dplyr::arrange(publication_year) |>
  dplyr::mutate(
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # publication_year becomes the first column
  dplyr::relocate(publication_year)
}The search terms is vision Open Alex search.
#|
if (get_count) {
# Number of OpenAlex works whose title or abstract matches the vision search
# term.
# NOTE(review): the search term is passed as-is here, while other queries in
# this file wrap theirs in compact() - confirm that this is intentional.
oa_count$vision <- openalexR::oa_fetch(
  title_and_abstract.search = params$st_vision,
  output = "list",
  count_only = TRUE,
  verbose = TRUE
)$count
}#|
if (get_count) {
# Number of works per publication year matching the vision search term,
# together with each year's share of the total (p) and the running share over
# the years (p_cum).
oa_count$vision_years <- openalexR::oa_fetch(
  title_and_abstract.search = params$st_vision,
  verbose = TRUE,
  output = "dataframe",
  group_by = "publication_year"
) |>
  dplyr::mutate(
    # the year is taken from `key_display_name` and converted to an integer;
    # the two key columns are dropped afterwards
    publication_year = as.integer(as.character(key_display_name)),
    key = NULL,
    key_display_name = NULL
  ) |>
  dplyr::arrange(publication_year) |>
  dplyr::mutate(
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # publication_year becomes the first column
  dplyr::relocate(publication_year)
}The search terms is technology Open Alex search.
#|
if (get_count) {
# Number of OpenAlex works whose title or abstract matches the technology
# search term (passed through compact() first).
oa_count$technology <- openalexR::oa_fetch(
  title_and_abstract.search = compact(params$st_technology),
  output = "list",
  count_only = TRUE,
  verbose = TRUE
)$count
}#|
if (get_count) {
# Number of works per publication year matching the technology search term,
# together with each year's share of the total (p) and the running share over
# the years (p_cum).
oa_count$technology_years <- openalexR::oa_fetch(
  title_and_abstract.search = compact(params$st_technology),
  verbose = TRUE,
  output = "dataframe",
  group_by = "publication_year"
) |>
  dplyr::mutate(
    # the year is taken from `key_display_name` and converted to an integer;
    # the two key columns are dropped afterwards
    publication_year = as.integer(as.character(key_display_name)),
    key = NULL,
    key_display_name = NULL
  ) |>
  dplyr::arrange(publication_year) |>
  dplyr::mutate(
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # publication_year becomes the first column
  dplyr::relocate(publication_year)
}Open Alex search.
The search term is vision AND technology
#|
if (get_count) {
# Number of OpenAlex works whose title or abstract matches both search terms,
# combined as "(vision) AND (technology)".
oa_count$vision_technology <- openalexR::oa_fetch(
  title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
  count_only = TRUE,
  output = "list",
  verbose = TRUE
)$count
}#|
if (get_count) {
# Number of works per publication year matching "(vision) AND (technology)",
# together with each year's share of the total (p) and the running share over
# the years (p_cum).
oa_count$vision_technology_years <- openalexR::oa_fetch(
  title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
  verbose = TRUE,
  output = "dataframe",
  group_by = "publication_year"
) |>
  dplyr::mutate(
    # the year is taken from `key_display_name` and converted to an integer;
    # the two key columns are dropped afterwards
    publication_year = as.integer(as.character(key_display_name)),
    key = NULL,
    key_display_name = NULL
  ) |>
  dplyr::arrange(publication_year) |>
  dplyr::mutate(
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # publication_year becomes the first column
  dplyr::relocate(publication_year)
}#|
if (get_count) {
# Number of works per subfield (grouped by `primary_topic.subfield.id`) for
# the combined "(vision) AND (technology)" search, sorted by key.
oa_count$vision_technology_subfields <- openalexR::oa_query(
  title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
  group_by = "primary_topic.subfield.id",
  verbose = TRUE
) |>
  openalexR::oa_request() |>
  dplyr::bind_rows() |>
  dplyr::arrange(key)

## Clean up missing or wrong vision_technology_subfields$key_display_name:
## a name is unusable when it is NA or when it is just a number. The numeric
## check coerces the names on purpose, so the expected "NAs introduced by
## coercion" warning is silenced for this single call only.
need_cleaning <- is.na(oa_count$vision_technology_subfields$key_display_name) |
  !is.na(suppressWarnings(as.numeric(oa_count$vision_technology_subfields$key_display_name)))
fine <- !need_cleaning

## Rows with an unusable name take the name of a "fine" row with the same key,
## are appended to the fine rows, and the counts are then summed per subfield.
## NOTE(review): merge() is an inner join, so rows that need cleaning and have
## no "fine" row with the same key are dropped - confirm this is intended.
oa_count$vision_technology_subfields <- oa_count$vision_technology_subfields |>
  dplyr::filter(fine) |>
  dplyr::select(key, key_display_name) |>
  dplyr::distinct() |>
  merge(
    # drop the unusable name by column name rather than by position
    y = dplyr::select(
      oa_count$vision_technology_subfields[need_cleaning, ],
      -key_display_name
    ),
    by = "key"
  ) |>
  dplyr::bind_rows(oa_count$vision_technology_subfields[fine, ]) |>
  dplyr::group_by(key, key_display_name) |>
  # return an ungrouped table and avoid the regrouping message
  dplyr::summarize(count = sum(count), .groups = "drop")
}Open Alex search.
The search term is vision AND technology AND case
# Too long search string
if (get_count) {
# Number of OpenAlex works whose title or abstract matches all three search
# terms, combined as "(vision) AND (technology) AND (case)".
oa_count$vision_technology_case <- openalexR::oa_fetch(
  title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_case, ")")),
  count_only = TRUE,
  output = "list",
  verbose = TRUE
)$count
}#|
if (get_count) {
# Number of works per publication year matching
# "(vision) AND (technology) AND (case)", together with each year's share of
# the total (p) and the running share over the years (p_cum).
oa_count$vision_technology_case_years <- openalexR::oa_fetch(
  title_and_abstract.search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ") AND (", params$st_case, ")")),
  verbose = TRUE,
  output = "dataframe",
  group_by = "publication_year"
) |>
  dplyr::mutate(
    # the year is taken from `key_display_name` and converted to an integer;
    # the two key columns are dropped afterwards
    publication_year = as.integer(as.character(key_display_name)),
    key = NULL,
    key_display_name = NULL
  ) |>
  dplyr::arrange(publication_year) |>
  dplyr::mutate(
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # publication_year becomes the first column
  dplyr::relocate(publication_year)
}if (get_count) {
# Store all collected counts in the file named by params$fn_count.
saveRDS(object = oa_count, file = params$fn_count)
}
technology AND vision Corpus from OpenAlex
The corpus download will be stored in ch2_technology/pages and the arrow database in data/ch2_technology/corpus_complete. This one will be filtered with the TCA / Global Corpus and get the final name data/ch2_technology/corpus.
This is not on github!
The corpus can be read by running corpus_read("data/ch2_technology/corpus") which opens the database so that then it can be fed into a dplyr pipeline. After most dplyr functions, the actual data needs to be collected via collect().
Only then is the actual data read!
Needs to be enabled by setting eval: true in the code block below.
#|
tic()
# Download the OpenAlex works matching "(vision) AND (technology)" into the
# pages directory.
# NOTE(review): `dry_run = TRUE` is set - presumably nothing is downloaded
# until it is switched off; confirm against IPBES.R::corpus_download().
# NOTE(review): the pages directory is spelled out here, whereas other chunks
# use params$pages_dir - confirm both point to the same place.
IPBES.R::corpus_download(
  title_and_abstract_search = compact(paste0("(", params$st_vision, ") AND (", params$st_technology, ")")),
  pages_dir = file.path(".", "data", "ch2_technology", "pages"),
  set_size = 2000,
  mc_cores = 6,
  continue = TRUE,
  delete_pages_dir = FALSE,
  dry_run = TRUE,
  verbose = TRUE
)
toc()The fields author and topics are serialized in the arrow database and need to be unserialized by using unserialize_arrow() on a dataset containing the two columns.
tic()
# Convert the downloaded pages in params$pages_dir into the arrow database in
# params$corpus_complete_dir.
IPBES.R::corpus_pages_to_arrow(
  arrow_dir = params$corpus_complete_dir,
  pages_dir = params$pages_dir,
  mc_cores = 3,
  continue = TRUE,
  delete_arrow_dir = FALSE,
  dry_run = FALSE,
  verbose = TRUE
)
toc()#|
tic()
# Restrict the complete technology corpus to the works that are also part of
# the TCA corpus. Runs only when the file holding the shared ids does not
# exist yet.
if (!file.exists(params$fn_ids_tech_in_tca)) {
  # ids of all works in the complete technology corpus
  # NOTE(review): `params$corpus_complete` is read here, but
  # `params$corpus_complete_dir` is used further down - confirm that both
  # resolve to the same directory.
  ids_technology <- IPBES.R::corpus_read(params$corpus_complete) |>
    dplyr::select(id) |>
    dplyr::collect() |>
    unlist()

  # ids of all works in the TCA corpus
  # NOTE(review): read_corpus() is called here, IPBES.R::corpus_read() above -
  # confirm which reader is meant.
  ids_tca <- read_corpus(file.path("..", "IPBES_TCA_Corpus", "data", "tca_corpus", "corpus")) |>
    dplyr::select(id) |>
    dplyr::collect() |>
    unlist()

  # The shared ids were previously assigned to `fn_ids_tech_in_tca`, which left
  # `ids_tech_in_tca` (saved, filtered on and removed below) undefined.
  ids_tech_in_tca <- ids_technology[ids_technology %in% ids_tca]
  rm(ids_technology, ids_tca)

  saveRDS(ids_tech_in_tca, params$fn_ids_tech_in_tca)

  # write the filtered corpus to params$corpus_dir
  IPBES.R::corpus_filter_ids(
    arrow_dir = params$corpus_complete_dir,
    arrow_filter_dir = params$corpus_dir,
    filter_ids = ids_tech_in_tca
  )
  rm(ids_tech_in_tca)
}
toc()#|
if (!file.exists(params$fn_sent_analysis_parquet)) {
# Export id, publication year and abstract of the corpus as one parquet file.
arrow::write_parquet(
  dplyr::select(corpus_read(params$corpus_dir), id, publication_year, ab),
  params$fn_sent_analysis_parquet
)
}technology AND vision in TCA Corpus#|
if (!file.exists(params$fn_random_sample_250)) {
# Write a random sample of 250 works to an Excel file; the seed is fixed
# before sampling.
set.seed(14)
read_corpus(params$corpus_dir) |>
  # keep the needed fields; `display_name` and `ab` are renamed on the way
  dplyr::select(id, doi, author_abbr, title = display_name, abstract = ab) |>
  dplyr::collect() |>
  dplyr::slice_sample(n = 250) |>
  # abstracts are cut to their first 5000 characters
  dplyr::mutate(abstract = substr(abstract, start = 1, stop = 5000)) |>
  writexl::write_xlsx(path = params$fn_random_sample_250)
}## | |
if (!file.exists(params$fn_publications_over_time)) {
# Yearly publication counts of the corpus in params$corpus_tca_dir, joined to
# the yearly OpenAlex counts (all works, vision, technology and
# vision-technology); the combined table is saved as an RDS file.
read_corpus(params$corpus_tca_dir) |>
  dplyr::select(publication_year) |>
  dplyr::arrange(publication_year) |>
  dplyr::collect() |>
  table() |>
  as.data.frame() |>
  dplyr::rename(count = Freq) |>
  dplyr::mutate(
    publication_year = as.integer(as.character(publication_year)),
    p = count / sum(count),
    p_cum = cumsum(p)
  ) |>
  # oa complete: `x` is named, so the piped table becomes `y` of this join and
  # its columns are the ones that receive the "_tca" suffix
  dplyr::left_join(
    x = oa_count$oa_years,
    suffix = c("", "_tca"),
    by = "publication_year"
  ) |>
  # oa vision
  dplyr::left_join(
    y = oa_count$vision_years,
    suffix = c("", "_vision"),
    by = "publication_year"
  ) |>
  # oa technology
  dplyr::left_join(
    y = oa_count$technology_years,
    suffix = c("", "_technology"),
    by = "publication_year"
  ) |>
  # oa vision technology
  dplyr::left_join(
    y = oa_count$vision_technology_years,
    suffix = c("", "_technology_vision"),
    by = "publication_year"
  ) |>
  # the columns still without a suffix are those of oa_count$oa_years
  dplyr::rename(
    p_cum_oa = p_cum,
    p_oa = p,
    count_oa = count
  ) |>
  saveRDS(file = params$fn_publications_over_time)
}if (length(list.files(path = dirname(params$fig_publications_over_time), pattern = basename(params$fig_publications_over_time))) < 2) {
# Plot the yearly number of TCA corpus publications (bars, primary axis)
# against the cumulative proportions of the different corpora (lines,
# secondary axis) and save the figure as pdf and png.

# Factor between the primary axis (number of publications) and the secondary
# axis (cumulative proportion): proportions are divided by it for drawing, and
# the secondary axis multiplies by it again to label them correctly.
sec_axi_fact <- 0.5e-5
figure <- readRDS(params$fn_publications_over_time) |>
  dplyr::filter(publication_year >= params$temporal_from) |>
  ggplot2::ggplot() +
  #
  ggplot2::geom_col(ggplot2::aes(x = publication_year, y = count_tca, fill = "Number of publications per year in TCA Corpus")) +
  # `linewidth` replaces the deprecated `size` argument for lines
  ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_oa / sec_axi_fact, color = "Cumulative proportion OA Corpus"), linewidth = 1.5) +
  ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_tca / sec_axi_fact, color = "Cumulative proportion TCA Corpus"), linewidth = 1.5) +
  ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_vision / sec_axi_fact, color = "Cumulative proportion vision only corpus"), linewidth = 1.5) +
  ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_technology / sec_axi_fact, color = "Cumulative proportion technology only corpus"), linewidth = 1.5) +
  ggplot2::geom_line(ggplot2::aes(x = publication_year, y = p_cum_technology_vision / sec_axi_fact, color = "Cumulative proportion Technology Corpus"), linewidth = 1.5) +
  #
  ggplot2::scale_color_manual(
    values = c(
      "Cumulative proportion OA Corpus" = "#1f77b4",
      "Cumulative proportion TCA Corpus" = "black",
      "Cumulative proportion vision only corpus" = "#2ca02c",
      "Cumulative proportion technology only corpus" = "#d62728",
      "Cumulative proportion Technology Corpus" = "#9467bd"
    )
  ) +
  ggplot2::scale_fill_manual(
    values = c("Number of publications per year in TCA Corpus" = "lightgrey")
  ) +
  #
  ggplot2::scale_x_continuous(breaks = seq(params$temporal_from, 2030, 10)) +
  ggplot2::scale_y_continuous(
    # the primary axis shows the bars, i.e. counts; a name given to the scale
    # takes precedence over labs(), so the axis title is set here
    "Number of publications",
    # multiply by sec_axi_fact to scale the secondary axis back to proportions
    sec.axis = ggplot2::sec_axis(~ . * sec_axi_fact, name = "Cumulative proportion")
  ) +
  ggplot2::labs(
    title = "Publications over time",
    x = "Year"
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(
    axis.text.y.right = ggplot2::element_text(color = "red"),
    legend.position = "inside", # Place the legend inside the plotting area
    legend.justification = c(0.1, 0.9), # Justify the legend towards the top left
    legend.background = ggplot2::element_rect(fill = "white", color = "black") # Add a white background to the legend
  )

ggplot2::ggsave(
  paste0(params$fig_publications_over_time, ".pdf"),
  plot = figure,
  width = 12,
  height = 12
)
ggplot2::ggsave(
  paste0(params$fig_publications_over_time, ".png"),
  plot = figure,
  width = 12,
  height = 12
)

rm(figure, sec_axi_fact)
}The results are based on data downloaded or accessed at:
# Distinct modification dates of the downloaded page files (*.rds) below
# params$pages_dir.
c_time <- list.files(
path = params$pages_dir,
recursive = TRUE,
# the dot is escaped so that only names ending in ".rds" match
pattern = "\\.rds$",
full.names = TRUE
) |>
file.mtime() |>
as.Date() |>
unique()./data/ch2_technology/corpus downloaded at 2024-03-27 from OpenAlexvision AND technology in TCA CorpusFor the TCA Corpus, we do have 544,445 number of works.
An Excel file containing a random sample of 250 works from the Technology Corpus (technology AND vision AND nature AND transformativechange) with the fields id, doi, author_abbr and abstract of the papers. The Excel file can be downloaded from here.
The subfields are based on the main topic assigned to each work. Other topics are also assigned, but this one has been identified as the main topic by an algorithm. count is the number of works in the vision AND technology corpus which have been assigned to the subfield.
Please take a look at these subfields of the topics to identify the ones to be filtered out.
The easiest way would be to download the Excel file through the button and to mark the subfields to be filtered out.
# Show the subfield counts, largest count first, via IPBES.R::table_dt().
# NOTE(review): `fn` is presumably the file name used for the download - confirm.
IPBES.R::table_dt((oa_count$vision_technology_subfields |> dplyr::arrange(desc(count))), fixedColumns = NULL, fn = "Vision Technology Subfields")A pdf of the graph can be downloaded here.
@report{krug,
author = {Krug, Rainer M.},
title = {Report {Assessment} {Ch2} {Technology} {Visions}},
doi = {XXXXXX},
langid = {en},
abstract = {A short description of what this is about. This is not a
traditional abstract, but rather something else ...}
}